In [18]:
from __future__ import print_function
import pandas as pd
import pickle
import numpy as np
from itertools import chain
from collections import OrderedDict
%load_ext autoreload
import random
In [22]:
# Load the bar review dataset
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized_SF.pickle')
review.head(5)
df_businesses = pd.read_pickle('../input/yelp_academic_dataset_business_SF.pickle')
# Unique "City, ST" strings for the businesses (the first entry of the unordered set is dropped).
city_state_list = list(set(df_businesses.city + ', ' + df_businesses.state))[1:]
pickle.dump(city_state_list, open('../output/city_state_list.pickle', 'wb'))
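In [ ]:
# Optional sanity check on the loaded data. The column names printed about here are the ones
# relied on later in the notebook: user_id, business_id and cleaned_tokenized for the reviews,
# city and state for the businesses.
print(review.shape, df_businesses.shape)
print(len(city_state_list), 'city/state pairs')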
In [23]:
review.tail(5)
Out[23]:
In [21]:
"deb75413-f53d-4f35-a403-d7d0048e2c97"
Out[21]:
In [24]:
# Hold out 20% of the users from the dataset for testing
user_set = list(set(review.user_id.values))
random.seed(0)
random.shuffle(user_set)  # Randomize the user order
n_users = len(user_set)
n_training = int(n_users * 0.8)
user_set_training = user_set[:n_training]
with open('../output/training_users.pickle', 'wb') as f:
    pickle.dump(user_set_training, f)
# Save the held-out test set
test_users = user_set[n_training:]
with open('../output/test_users.pickle', 'wb') as f:
    pickle.dump(test_users, f)
# Restrict the active review set to training users only
review = review[review.user_id.isin(user_set_training)]
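In [ ]:
# Optional sanity check on the 80/20 user split: the two sets should be disjoint and together
# cover every user.
assert set(user_set_training).isdisjoint(test_users)
assert len(user_set_training) + len(test_users) == len(user_set)
print(len(user_set_training), 'training users,', len(test_users), 'test users')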
In [11]:
"deb75413-f53d-4f35-a403-d7d0048e2c97" in user_set_training
Out[11]:
In [25]:
# This is review-level, not business-level
# docs = [" ".join(list(chain.from_iterable(l))) for l in review.cleaned_tokenized.iloc[:]]
n_reviews = None  # use all reviews
# Merge all reviews of each business into a single document (one long string of words).
reviews_merged_bus = OrderedDict()
business_set = set(review.business_id.values[:n_reviews])
for i_bus, bus_id in enumerate(business_set):
    if (i_bus % 2) == 0:
        print('\r Fraction Processed', float(i_bus + 1) / len(business_set), end="")
    # Collapse each review of this business (a list of tokenized sentences) into a flat list
    # of words, then join the words of all its reviews into one string.
    reviews_merged_bus[bus_id] = " ".join(chain.from_iterable(
        chain.from_iterable(review.cleaned_tokenized[review.business_id == bus_id])))
docs_bus = list(reviews_merged_bus.values())
with open('../output/docs_bars_bus.pickle', 'wb') as f:
    pickle.dump(docs_bus, f)
with open('../output/bus_ids_bars_LDA.pickle', 'wb') as f:
    pickle.dump(list(reviews_merged_bus.keys()), f)
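In [ ]:
# A faster, equivalent way to build the per-business documents (sketch): group once by
# business_id instead of re-scanning the whole frame for every business. This assumes
# cleaned_tokenized holds a list of tokenized sentences per review, as the loop above implies.
reviews_merged_bus_alt = OrderedDict(
    (bus_id, " ".join(chain.from_iterable(chain.from_iterable(grp))))
    for bus_id, grp in review.groupby('business_id')['cleaned_tokenized'])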
In [71]:
In [4]:
# Flatten the reviews, so each review is just a single list of words.
# reviews_merged_user = OrderedDict()
# user_set = list(set(review.user_id.values[:n_reviews]))
# n_users = float(len(user_set))
# for i_user, user_id in enumerate(user_set[:]):
# if ((i_user%50)==0):
# print ('\r Fraction Processed',float(i_user+1)/n_users, end="")
# # This horrible line first collapses each review of a corresponding user reviews into a list
# # of lists, and then collapses the list of sentences to a long list of words
# reviews_merged_user[user_id] = " ".join(list(chain.from_iterable(
# chain.from_iterable( review.cleaned_tokenized[review.user_id==user_id] ))))
# docs_users = reviews_merged_user.values()
# print()
# print("Merging Done...")
# with open('../output/docs_bars_users.pickle', 'wb') as f:
# pickle.dump(docs_users, f)
In [26]:
# Flatten each review into a single document (one string of words).
docs_reviews = [" ".join(chain.from_iterable(rev)) for rev in review.cleaned_tokenized.values[:n_reviews]]
with open('../output/docs_reviews.pickle', 'wb') as f:
pickle.dump(docs_reviews, f)
In [ ]:
In [ ]:
In [27]:
%autoreload 2
import sys
sys.path.append('../vectorsearch/')
import LDA
try:
    from importlib import reload  # Python 3: reload is no longer a builtin
except ImportError:
    pass
reload(LDA)
n_topics=30
n_features=10000
max_df=.75
min_df=3
max_iter=10
alpha=6./n_topics
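In [ ]:
# For reference: LDA.LDA is a project-local wrapper that appears to wrap scikit-learn's
# LatentDirichletAllocation (its .lda attribute is used that way below). A minimal sketch of
# the equivalent scikit-learn setup with the same hyperparameters; tf_vectorizer/tf_matrix are
# illustrative names, and n_topics was renamed n_components in newer scikit-learn releases.
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=n_features,
#                                 stop_words='english')
# tf_matrix = tf_vectorizer.fit_transform(docs_bus)
# lda_sklearn = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=alpha,
#                                         max_iter=max_iter, learning_method='online')
# lda_sklearn.fit(tf_matrix)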
In [ ]:
# Train the bar set over businesses
#doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
lda_bus = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_bus.vectorizecounts(docs_bus)
lda_bus.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_bus.pickle', lda_bus)
# Now can
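In [ ]:
# Quick look at the fitted business-level topics. print_top_words is part of the local LDA
# wrapper (it is also referenced in a commented-out cell below); the equivalent via the
# underlying scikit-learn attributes would use lda_bus.lda.components_.
lda_bus.print_top_words(10)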
In [7]:
# Train the bar set over users
# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)
In [12]:
# Train LDA at the level of individual reviews
docs_reviews = pickle.load(open('../output/docs_reviews.pickle', 'rb'))
lda_reviews = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_reviews.vectorizecounts(docs_reviews)
lda_reviews.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_reviews.pickle', lda_reviews)
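In [ ]:
# Spot-check the fitted review-level model: per-document topic distributions for a few reviews.
# This assumes the wrapper stores the fitted model and count matrix as .lda and .tf, the same
# attributes the business-level cells below rely on.
doc_topics_sample = lda_reviews.lda.transform(lda_reviews.tf[10:20])
print(doc_topics_sample.shape)  # (10, n_topics)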
In [51]:
# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)
#lda_bus.print_top_words(10)
#.get_doc_topics(doc_users[10:12])
In [ ]:
import sys
sys.path.append('../vectorsearch/')
import LDA
bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')
In [ ]:
In [ ]:
# The topic vector for a given business is given by this dataframe.
bus_lda_ids = pickle.load(open('../output/bus_ids_bars_LDA.pickle', 'rb'))
bus_vectors = pd.DataFrame()
bus_vectors['business_id'] = bus_lda_ids
transformed = bus_lda.lda.transform(bus_lda.tf)
In [ ]:
print(transformed.shape)
print(len(bus_vectors))
bus_vectors['topic_vector'] = [bus_topic_vec for bus_topic_vec in transformed]
# Normalize each topic vector to unit length so that dot products give cosine similarity.
normed_topic_vecs = [topic_vec / np.sqrt(np.dot(topic_vec, topic_vec))
                     for topic_vec in bus_vectors.topic_vector]
bus_vectors.topic_vector = normed_topic_vecs
bus_vectors.to_pickle('../output/business_LDA_vectors.pickle')
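In [ ]:
# With unit-normalized topic vectors, a dot product is the cosine similarity between two
# businesses. A minimal sketch (illustrative only) ranking the businesses most similar to the
# first one in the table:
query_vec = bus_vectors.topic_vector.iloc[0]
sims = np.array([np.dot(query_vec, vec) for vec in bus_vectors.topic_vector])
print(bus_vectors.business_id.values[np.argsort(sims)[::-1][:5]])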
In [ ]:
# Visualization
# lda_reviews.get_doc_topics(docs_reviews[10:20])
In [1]:
# import pyLDAvis
# import pandas as pd
# import funcy as fp
# from pyLDAvis import prepare as vis_prepare
# def _extract_data(docs, vect, lda):
#     # The scikit-learn LDA implementation seems to have buggy code:
#     # topic_term_dists and doc_topic_dists don't sum to 1,
#     # hence the norm function to normalize the distributions.
#     norm = lambda data: pd.DataFrame(data).div(data.sum(1), axis=0).values
#     vected = vect.fit_transform(docs)
#     doc_topic_dists = norm(lda.fit_transform(vected))
#     return lda, vect, dict(
#         doc_lengths = docs.str.len(),
#         vocab = vect.get_feature_names(),
#         term_frequency = vected.sum(axis=0).tolist()[0],
#         topic_term_dists = norm(lda.components_),
#         doc_topic_dists = doc_topic_dists)
# def prepare(docs, vect, lda, **kwargs):
#     """Create Prepared Data from sklearn's vectorizer and Latent Dirichlet
#     Allocation.
#     Parameters
#     ----------
#     docs : Pandas Series.
#         Documents to be passed as an input.
#     vect : Scikit-Learn Vectorizer (CountVectorizer, TfidfVectorizer).
#         Vectorizer to convert documents into a sparse matrix.
#     lda : sklearn.decomposition.LatentDirichletAllocation.
#         Latent Dirichlet Allocation
#     **kwargs : Keyword arguments to be passed to pyLDAvis.prepare()
#     Returns
#     -------
#     prepared_data : PreparedData
#         The data structures used in the visualization
#     Example
#     --------
#     For example usage please see this notebook:
#     http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb
#     See
#     ------
#     See `pyLDAvis.prepare` for **kwargs.
#     """
#     opts = fp.merge(_extract_data(docs, vect, lda)[2], kwargs)
#     return vis_prepare(**opts)
# vis_data = prepare(docs, tf_vectorizer, lda)
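In [ ]:
# Note: recent pyLDAvis releases ship scikit-learn support directly, which would replace the
# hand-rolled prepare() above (the module path has moved between versions, and the variable
# names here are illustrative, so treat this as a sketch):
# import pyLDAvis.sklearn
# vis_data = pyLDAvis.sklearn.prepare(lda, tf_matrix, tf_vectorizer)
# pyLDAvis.display(vis_data)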
In [ ]:
In [13]:
import sys
sys.path.append('../vectorsearch/')
import LDA
bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')
In [15]:
bus_lda.lda.n_jobs = 1
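In [ ]:
# Example of projecting new text into the business topic space with the loaded model.
# This assumes the wrapper keeps its fitted CountVectorizer as .tf_vectorizer; if it is stored
# under another name, adjust accordingly.
# new_doc = ["great cocktails and a quiet back patio"]
# print(bus_lda.lda.transform(bus_lda.tf_vectorizer.transform(new_doc)))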
In [ ]: